# NOTE: the workspace is intentionally NOT cleared here. rm(list = ls())
# deletes the caller's objects when this script is source()d, yet does not
# reset loaded packages or options, so it gives no real isolation. Every
# variable used below is assigned before it is read; run the script in a
# fresh R session (e.g. via Rscript) for a clean environment.

library(pdftools)
library(stringr)
library(tidyverse)
library(haven)

#########################################

# Project directories come from environment variables (per the original
# note, these are pulled from the Stata file 00_main.do). All five are
# looked up in one call and then unpacked by name; an unset variable
# yields "" exactly as a single-name Sys.getenv() call would.
env_dirs <- Sys.getenv(
  c("RAW", "INTERMEDIATE", "FOR_ANALYSIS", "TEMP", "OUTPUT"),
  names = TRUE
)

raw_path          <- env_dirs[["RAW"]]
intermediate_path <- env_dirs[["INTERMEDIATE"]]
for_analysis_path <- env_dirs[["FOR_ANALYSIS"]]
temp_path         <- env_dirs[["TEMP"]]
output_path       <- env_dirs[["OUTPUT"]]

# Sub-folders used by this script: the yearly report PDFs are read from
# the first, the cleaned .dta files are written to the second.
usda_tableB6 <- file.path(raw_path, "usda_tableB6")
cleaned_usda <- file.path(intermediate_path, "cleaned_usda")

# Report years to process; one "<year>CharReport.pdf" is expected per year.
years <- 2000:2016

# Fail early with a clear message if either folder is missing. Otherwise an
# unset RAW / INTERMEDIATE environment variable only shows up as an obscure
# file error part-way through the loop.
if (!dir.exists(usda_tableB6)) {
  stop("Input folder not found: ", usda_tableB6, call. = FALSE)
}
if (!dir.exists(cleaned_usda)) {
  stop("Output folder not found: ", cleaned_usda, call. = FALSE)
}

for (yr in years) {
  # Format only the file name: running sprintf() over the whole path would
  # treat any "%" in the directory part as a format directive.
  infile <- file.path(usda_tableB6, sprintf("%dCharReport.pdf", yr))

  # Only the first page of the PDF is read.
  # NOTE(review): assumes the table of interest sits entirely on page 1 --
  # confirm against the source PDFs.
  page_text <- pdf_text(infile)[1]

  # One element per text line, then split each trimmed line into at most 11
  # fields wherever two or more whitespace characters occur in a row.
  page_lines <- unlist(str_split(page_text, "[\\r\\n]+"))
  fields <- str_split_fixed(str_trim(page_lines), "\\s{2,}", 11)

  # Generate dataset; keep only the state name and SSI columns
  # (fields 1 and 6 of each line).
  df <- as.data.frame(fields)
  df <- df[, c("V1", "V6")]

  # Cleaning: drop periods from the first field and trim what is left.
  df$V1 <- gsub("\\.", "", df$V1)
  df$V1 <- trimws(df$V1)
  names(df)[names(df) == "V1"] <- "state"
  names(df)[names(df) == "V6"] <- "snap_ssi_count"

  # Keep the rows from the first "Alabama" match through the last "Wyoming"
  # match, discarding everything above and below.
  first_row <- which(grepl("Alabama", df$state))[1]
  last_row  <- tail(which(grepl("Wyoming", df$state)), 1)

  # Without this guard a layout change in one year's PDF would surface as an
  # opaque "NA/NaN argument" or "argument of length 0" error from `:`.
  if (is.na(first_row) || length(last_row) == 0 || last_row < first_row) {
    stop(
      "Could not locate the Alabama-to-Wyoming rows in ", infile,
      call. = FALSE
    )
  }
  df <- df[first_row:last_row, ]

  outfile <- file.path(cleaned_usda, sprintf("snap_ssi_%d.dta", yr))
  write_dta(df, outfile)
}
